{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import fasttext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = fasttext.train_supervised('train-bahasa.txt', dim = 16, minn = 2, loss = 'hs')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_model(\"fasttext-malaya.bin\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4122457"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(model.get_words())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 70 µs, sys: 1 µs, total: 71 µs\n",
      "Wall time: 78.2 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__manglish', '__label__malay'), array([0.97094667, 0.02911203]))"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "model.predict(\"can lah\", k = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.quantize(input='train-bahasa.txt', retrain=True, qnorm=True, \n",
    "               cutoff = int(len(model.get_words()) * 0.2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_model(\"fasttext-malaya.ftz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "\n",
    "bucketName = 'huseinhouse-storage'\n",
    "Key = 'fasttext-malaya.bin'\n",
    "outPutname = \"v34/language-detection/fasttext-malaya.bin\"\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "s3.upload_file(Key,bucketName,outPutname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "bucketName = 'huseinhouse-storage'\n",
    "Key = 'fasttext-malaya.ftz'\n",
    "outPutname = \"v34/language-detection/fasttext-malaya.ftz\"\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "s3.upload_file(Key,bucketName,outPutname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['__label__malay',\n",
       " '__label__other',\n",
       " '__label__ind',\n",
       " '__label__eng',\n",
       " '__label__rojak',\n",
       " '__label__manglish']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 91 µs, sys: 1 µs, total: 92 µs\n",
      "Wall time: 97.5 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__ind', '__label__other', '__label__eng'),\n",
       " array([1.00002944e+00, 1.04769351e-05, 1.00695570e-05]))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "model.predict(\"nnti u tolong i ok\", k = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 70 µs, sys: 0 ns, total: 70 µs\n",
      "Wall time: 77.5 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__eng', '__label__manglish', '__label__malay'),\n",
       " array([9.97947037e-01, 9.92110930e-04, 6.68452645e-04]))"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "model.predict(\"give me some water pls\", k = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 44 µs, sys: 0 ns, total: 44 µs\n",
      "Wall time: 48.4 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__malay', '__label__eng', '__label__rojak'),\n",
       " array([0.96453685, 0.03313386, 0.00229743]))"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "model.predict(\"bodoh tu simpan skit pls\", k = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 43 µs, sys: 1 µs, total: 44 µs\n",
      "Wall time: 47.4 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__other', '__label__eng', '__label__malay'),\n",
       " array([9.99994755e-01, 3.13595920e-05, 1.39339018e-05]))"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "model.predict(\"我该\", k = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
